0.1 Loading Data

wf <- st_read(file.path(shapeDir, "wholeFoods"))
## Reading layer `wholeFoods' from data source `/Users/kaz/Documents/Fall2020/WholeFoodsStudy/shape/wholeFoods' using driver `ESRI Shapefile'
## Simple feature collection with 2 features and 1 field
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -73.98821 ymin: 40.67497 xmax: -73.94837 ymax: 40.80765
## geographic CRS: WGS 84
wfHarlem <- wf %>% filter(store == "Whole Foods Harlem") %>% st_transform(2263)
wfHarlemBufferHalf <- wfHarlem %>% st_buffer(2640)
wfHarlemBufferOne <- wfHarlem %>% st_buffer(2640*2)
pluto <- st_read(file.path(shapeDir, "PLUTO_Dissolve")) %>% filter(Borough == "MN")
## Reading layer `PLUTO_Dissolve' from data source `/Users/kaz/Documents/Fall2020/WholeFoodsStudy/shape/PLUTO_Dissolve' using driver `ESRI Shapefile'
## Simple feature collection with 29922 features and 7 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 913128.9 ymin: 120049 xmax: 1067336 ymax: 272811.2
## projected CRS:  NAD83 / New York Long Island (ftUS)
tm_shape(pluto, bbox = st_bbox(wfHarlemBufferOne)) +
    tm_polygons() +
    tm_shape(wfHarlemBufferHalf) +
    tm_borders() +
    tm_shape(wfHarlemBufferOne) +
    tm_borders() +
    tm_shape(wfHarlem) +
    tm_symbols()

plutoWF <- pluto %>% filter(st_intersects(pluto, wfHarlemBufferOne,sparse = FALSE)) %>% 
    select(BoroName, Block) %>% 
    group_by(Block) %>% 
    summarize(BoroName = first(BoroName)) %>% 
    st_cast() %>% 
    ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
plutoWF <- plutoWF %>% 
    filter(!Block %in% c(1111, 1897)) %>% 
    sf::st_join(wfHarlemBufferHalf %>%mutate(buffer = "Half"),join =st_within) %>%  
    dplyr::mutate(buffer = if_else(is.na(buffer), "Mile", buffer))

tm_shape(plutoWF) + 
    tm_polygons(col = "buffer")

0.2 Permits

permits <- read_csv(file.path(dataDir, "DOB_Permit_Issuance2010_2020.csv"))
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Bin #` = col_double(),
##   `Job #` = col_double(),
##   `Community Board` = col_double(),
##   `Zip Code` = col_double(),
##   `Bldg Type` = col_double(),
##   `Permittee's Phone #` = col_double(),
##   `Act as Superintendent` = col_logical(),
##   `Permittee's Other Title` = col_logical(),
##   PERMIT_SI_NO = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   COUNCIL_DISTRICT = col_double(),
##   CENSUS_TRACT = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 3527 parsing failures.
##  row                   col           expected actual                                                                                  file
## 3392 Act as Superintendent 1/0/T/F/TRUE/FALSE      Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## 3540 Act as Superintendent 1/0/T/F/TRUE/FALSE      Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## 5760 Act as Superintendent 1/0/T/F/TRUE/FALSE      Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## 7670 Act as Superintendent 1/0/T/F/TRUE/FALSE      Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## 8141 Act as Superintendent 1/0/T/F/TRUE/FALSE      Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## .... ..................... .................. ...... .....................................................................................
## See problems(...) for more details.
permits <- permits %>% 
    janitor::clean_names()%>% 
    select(borough, job_type,
           block, lot, permit_type,
           filing_date, latitude, longitude)

permits <- permits %>% 
    mutate(filing_date = lubridate::mdy_hms(filing_date),
           year = lubridate::year(filing_date),
           month = lubridate::month(filing_date))

permitsMN <- permits %>% filter(borough == "MANHATTAN")
rm(permits)

permitsWF <- permitsMN %>% mutate(Block = as.numeric(block)) %>% 
                         left_join(plutoWF %>% st_drop_geometry()) %>% 
    filter(!is.na(buffer))
## Joining, by = "Block"
permitsWF %>% 
    group_by( month, year, buffer) %>% 
    summarize(date = min(filing_date),
              count = n()) %>% 
    ggplot(aes(date, count)) +
    geom_point(aes(color = buffer)) +
    geom_smooth(aes(color = buffer)) + 
    #facet_wrap(~buffer) +
    theme(legend.position = "none")
## `summarise()` regrouping output by 'month', 'year' (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

0.3 Sales

sales <- read_csv(file.path(dataDir, "rollingSales","nycSales_2010_2019"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   NEIGHBORHOOD = col_character(),
##   `BUILDING CLASS CATEGORY` = col_character(),
##   `TAX CLASS AT PRESENT` = col_character(),
##   `EASE-MENT` = col_logical(),
##   `BUILDING CLASS AT PRESENT` = col_character(),
##   ADDRESS = col_character(),
##   `APARTMENT NUMBER` = col_character(),
##   `BUILDING CLASS AT TIME OF SALE` = col_character(),
##   `SALE DATE` = col_datetime(format = "")
## )
## See spec(...) for full column specifications.
## Warning: 1 parsing failure.
##    row       col           expected actual                                                                                 file
## 310313 EASE-MENT 1/0/T/F/TRUE/FALSE      E '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/rollingSales/nycSales_2010_2019'
mnSales <- sales %>% filter(BOROUGH == 1) %>% janitor:::clean_names()
rm(sales)
# mnSalesJoin <- plutoWF %>%     
#     janitor:::clean_names() %>% 
#     left_join(mnSales, by = c("block" = "block")) %>% 
#     tidyr::separate(address,  c("address","aptNum"), ",") %>% 
#     mutate(BL = paste0(block, stringr::str_pad(lot, 4,side = "left", pad = "0")),
#            bbl = paste0(1, stringr::str_pad(block, 5,side = "left", pad = "0"), stringr::str_pad(lot, 4,side = "left", pad = "0")),
#           aptNum = if_else(is.na(apartment_number), aptNum, apartment_number),
#           time = factor(if_else(sale_date < ymd("2017-01-01"), "Before WF", "After WF"), levels = c("Before WF", "After WF")))
# library(geoclient)
# 
# mnSalesJoin <- mnSalesJoin %>% filter(nchar(bbl) == 10) %>% 
#     mutate(bbl_use = geo_bbl(bbl)[["bbl"]])
mnSalesJoin <- readr::read_csv(file.path(dataDir, "mnSalesJoin.csv"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   borough.x = col_character(),
##   boro_name = col_character(),
##   ct2010_y = col_character(),
##   store = col_character(),
##   buffer = col_character(),
##   neighborhood = col_character(),
##   building_class_category = col_character(),
##   tax_class_at_present = col_character(),
##   ease_ment = col_logical(),
##   building_class_at_present = col_character(),
##   address = col_character(),
##   aptNum = col_character(),
##   apartment_number = col_character(),
##   building_class_at_time_of_sale = col_character(),
##   sale_date = col_datetime(format = ""),
##   geometry = col_character(),
##   time = col_character()
## )
## See spec(...) for full column specifications.

0.4 average change

mnSalesSummary <- mnSalesJoin %>% 
    #st_drop_geometry() %>% 
    filter(sale_price > 50000) %>% 
    mutate(month = month(sale_date),
           year = year(sale_date)) %>% 
    group_by(year, buffer) %>% 
    summarize(meanSale = mean(sale_price),
              medSale = median(sale_price),
              count = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

1 average sale price

mnSalesSummary %>% 
    ggplot(aes(year, meanSale)) +
    geom_point(aes(color = buffer)) +
    geom_smooth(aes(color = buffer)) 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

    #facet_wrap(~buffer) +
mnSalesSummary %>% 
    ggplot(aes(year, medSale)) +
    geom_point(aes(color = buffer)) +
    geom_smooth(aes(color = buffer)) 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

mnSalesSummary %>% 
    ggplot(aes(year, count)) +
    geom_point(aes(color = buffer)) +
    geom_smooth(aes(color = buffer)) 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

1.1 Repeat sales

repeatLots <- mnSalesJoin %>% 
    #st_drop_geometry() %>% 
    filter(sale_price > 50000) %>% 
    group_by(BL, address, aptNum) %>% count() %>% 
    filter(n > 1) %>% 
    pull(BL)

mnSalesRepeat <- mnSalesJoin %>% filter(BL %in% repeatLots) %>% 
    arrange(BL)
mnSalesRepeat %>% 
    #st_drop_geometry() %>% 
    group_by(BL, buffer, time) %>% 
    summarize(last= last(sale_price)) %>% 
    ungroup() %>% 
    filter(last < 10000000, last > 50000) %>% 
    ggplot(aes(time, last)) +
    geom_point(aes(group = BL, color = time)) +
    geom_line(aes(group = BL), alpha = .2) +
    facet_wrap(~buffer) + 
    theme_minimal()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)

mnSalesRepeat %>% 
    #st_drop_geometry() %>% 
    group_by(BL, buffer, time) %>% 
    summarize(last= last(sale_price),
              class = first(building_class_at_present)) %>% 
    ungroup() %>% 
    filter(last < 10000000, last > 50000, class == "R4") %>% 
    ggplot(aes(time, last)) +
    geom_point(aes(group = BL, color = time)) +
    geom_line(aes(group = BL), alpha = .2) +
    facet_wrap(~buffer) + 
    theme_minimal()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)

1.2 Avg chanage R4

mnSalesRepeat %>% 
    #st_drop_geometry() %>% 
    filter(building_class_at_present == "R4") %>% 
    group_by(BL, buffer, time) %>% 
    summarize(last= last(sale_price),
              class = first(building_class_at_present)) %>% 
    ungroup() %>% 
    group_by(buffer,time) %>% 
    summarize(avgPrice = mean(last),
              medPrice = median(last)) %>% 
    ggplot(aes(time, avgPrice)) +
    geom_point(aes(group = buffer, color = buffer)) +
    geom_line(aes(group = buffer), alpha = .2) +
    theme_minimal()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
## `summarise()` regrouping output by 'buffer' (override with `.groups` argument)

## Median change R4

mnSalesRepeat %>% 
   # st_drop_geometry() %>% 
    filter(building_class_at_present == "R4") %>% 
    group_by(BL, buffer, time) %>% 
    summarize(last= last(sale_price),
              class = first(building_class_at_present)) %>% 
    ungroup() %>% 
    group_by(buffer,time) %>% 
    summarize(avgPrice = mean(last),
              medPrice = median(last)) %>% 
    ggplot(aes(time, medPrice)) +
    geom_point(aes(group = buffer, color = buffer)) +
    geom_line(aes(group = buffer), alpha = .2) +
    theme_minimal()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
## `summarise()` regrouping output by 'buffer' (override with `.groups` argument)

didreg = lm(last ~ buffer * time, data = mnSalesRepeat %>% 
    #st_drop_geometry() %>% 
    filter(building_class_at_present == "R4") %>% 
    group_by(BL, buffer, time) %>% 
    summarize(last= last(sale_price),
              class = first(building_class_at_present)) %>% 
    ungroup() %>% 
                mutate(buffer = if_else(buffer == "Mile", 0, 1),
                       time = if_else(time =="Before WF", 0, 1)))
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
summary(didreg)
## 
## Call:
## lm(formula = last ~ buffer * time, data = mnSalesRepeat %>% filter(building_class_at_present == 
##     "R4") %>% group_by(BL, buffer, time) %>% summarize(last = last(sale_price), 
##     class = first(building_class_at_present)) %>% ungroup() %>% 
##     mutate(buffer = if_else(buffer == "Mile", 0, 1), time = if_else(time == 
##         "Before WF", 0, 1)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1106681  -398308  -161098   194532 12571192 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   836808      58201  14.378  < 2e-16 ***
## buffer         78660     110677   0.711  0.47759    
## time          269872      98155   2.749  0.00618 ** 
## buffer:time  -169564     166041  -1.021  0.30763    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 873000 on 513 degrees of freedom
## Multiple R-squared:  0.0159, Adjusted R-squared:  0.01014 
## F-statistic: 2.762 on 3 and 513 DF,  p-value: 0.04151

1.3 panel data

didreg = lm(last ~ buffer * time, data = mnSalesRepeat %>% 
    #st_drop_geometry() %>% 
    group_by(BL, buffer, time) %>% 
    summarize(last= last(sale_price),
              class = first(building_class_at_present)) %>% 
    filter(last < 10000000, last > 50000)  %>% 
    tidyr::pivot_wider(names_from = time, values_from = last) %>% 
    filter(!is.na(`Before WF`), !is.na(`After WF`)) %>% 
    tidyr::pivot_longer(names_to = "time", values_to = "last", -c(BL, buffer, class)) %>% 
    ungroup() %>% 
                mutate(buffer = if_else(buffer == "Mile", 0, 1),
                       time = if_else(time =="Before WF", 0, 1)))
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
summary(didreg)
## 
## Call:
## lm(formula = last ~ buffer * time, data = mnSalesRepeat %>% group_by(BL, 
##     buffer, time) %>% summarize(last = last(sale_price), class = first(building_class_at_present)) %>% 
##     filter(last < 1e+07, last > 50000) %>% tidyr::pivot_wider(names_from = time, 
##     values_from = last) %>% filter(!is.na(`Before WF`), !is.na(`After WF`)) %>% 
##     tidyr::pivot_longer(names_to = "time", values_to = "last", 
##         -c(BL, buffer, class)) %>% ungroup() %>% mutate(buffer = if_else(buffer == 
##     "Mile", 0, 1), time = if_else(time == "Before WF", 0, 1)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1495641  -798879  -449368   211031  8302959 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1214542      96342  12.607  < 2e-16 ***
## buffer        -69930     173757  -0.402  0.68749    
## time          382499     136249   2.807  0.00516 ** 
## buffer:time    69715     245730   0.284  0.77674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1379000 on 588 degrees of freedom
## Multiple R-squared:  0.02139,    Adjusted R-squared:  0.0164 
## F-statistic: 4.284 on 3 and 588 DF,  p-value: 0.00527
didreg = lm(last ~ buffer * time, data = mnSalesRepeat %>% 
    #st_drop_geometry() %>% 
    group_by(BL, buffer, time) %>% 
    summarize(last= last(sale_price),
              class = first(building_class_at_present)) %>% 
    filter(last < 10000000, last > 50000)  %>% 
    tidyr::pivot_wider(names_from = time, values_from = last) %>% 
    filter(!is.na(`Before WF`), !is.na(`After WF`)) %>% 
    tidyr::pivot_longer(names_to = "time", values_to = "last", -c(BL, buffer, class)) %>% 
    ungroup() %>% filter(class == "R4") %>% 
                mutate(buffer = if_else(buffer == "Mile", 0, 1),
                       time = if_else(time =="Before WF", 0, 1)))
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
summary(didreg)
## 
## Call:
## lm(formula = last ~ buffer * time, data = mnSalesRepeat %>% group_by(BL, 
##     buffer, time) %>% summarize(last = last(sale_price), class = first(building_class_at_present)) %>% 
##     filter(last < 1e+07, last > 50000) %>% tidyr::pivot_wider(names_from = time, 
##     values_from = last) %>% filter(!is.na(`Before WF`), !is.na(`After WF`)) %>% 
##     tidyr::pivot_longer(names_to = "time", values_to = "last", 
##         -c(BL, buffer, class)) %>% ungroup() %>% filter(class == 
##     "R4") %>% mutate(buffer = if_else(buffer == "Mile", 0, 1), 
##     time = if_else(time == "Before WF", 0, 1)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -944759 -441560 -148154  175690 5350241 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   851759      84251  10.110  < 2e-16 ***
## buffer          4792     149444   0.032 0.974447    
## time          398000     119149   3.340 0.000962 ***
## buffer:time   -32740     211346  -0.155 0.877013    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 790300 on 254 degrees of freedom
## Multiple R-squared:  0.05768,    Adjusted R-squared:  0.04655 
## F-statistic: 5.183 on 3 and 254 DF,  p-value: 0.001716

fixed effects

library(lfe)
## Loading required package: Matrix
library(broom)
FEDF <- mnSalesRepeat %>% 
   # st_drop_geometry() %>% 
    mutate(year = year(sale_date)) %>% 
    group_by(BL, buffer, year) %>% 
    summarize(last= last(sale_price),
              class = first(building_class_at_present),
              time = first(time)) %>% 
    filter(last < 10000000, last > 50000) %>% 
                mutate(buffer = if_else(buffer == "Mile", 0, 1),
                       time = if_else(time =="Before WF", 0, 1),
                       did = buffer * time) %>% 
    ungroup()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
test <- lfe::felm(last~ did | BL + year, FEDF)

summary(test)
## 
## Call:
##    lfe::felm(formula = last ~ did | BL + year, data = FEDF) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3174888  -159237        0   139785  3767680 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)
## did   114965     106245   1.082    0.279
## 
## Residual standard error: 743400 on 1077 degrees of freedom
## Multiple R-squared(full model): 0.8633   Adjusted R-squared: 0.7534 
## Multiple R-squared(proj model): 0.001086   Adjusted R-squared: -0.8021 
## F-statistic(full model):7.856 on 866 and 1077 DF, p-value: < 2.2e-16 
## F-statistic(proj model): 1.171 on 1 and 1077 DF, p-value: 0.2795

1.4 Hedonic Pricing Model

1.4.1

harlemlots <- read_csv(file.path(dataDir, "modelling", "harlemSales_Attributes.csv"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   neighborhood = col_character(),
##   building_class_category = col_character(),
##   tax_class_at_present = col_character(),
##   building_class_at_present = col_character(),
##   address = col_character(),
##   building_class_at_time_of_sale = col_character(),
##   sale_date = col_date(format = ""),
##   BoroName = col_character()
## )
## See spec(...) for full column specifications.
harlemlots %>% group_by(year) %>% count() 
## # A tibble: 10 x 2
## # Groups:   year [10]
##     year     n
##    <dbl> <int>
##  1  2010   313
##  2  2011   451
##  3  2012   619
##  4  2013   705
##  5  2014   644
##  6  2015   650
##  7  2016   620
##  8  2017   542
##  9  2018   544
## 10  2019   573

1.5 make a model per year

valueFormula <- sale_price ~ 
    land_square_feet + gross_square_feet + year_built  + flagshipParkDist + parkDist + subwayDist +
    libDistance + popBlack/TotPop + popHisp/TotPop + avgHHsize + MedHHI + OwnerOccUnits/TotOccUnits + 
    OO_20k + OO20_49k + OO50_99k + OO50_99k + OO100_149k + OO150_299k + OO300_499k + OO_500_749k + 
    OO750_999k + OO1mil_ + permits - 1


permits <- permitsWF %>% 
    group_by(borough, Block, year) %>% 
    summarize(permits = n()) 
## `summarise()` regrouping output by 'borough', 'Block' (override with `.groups` argument)
harlemDF <- harlemlots%>% 
    filter(sale_price < 10000000, sale_price > 50000)  %>% 
    left_join(permits %>% select(-borough)) %>% 
    left_join(mnSalesJoin %>% select(bbl, block, lot) %>% distinct(.), by = c("Block" = "block", "lot" = "lot"))
## Adding missing grouping variables: `borough`
## Joining, by = c("Block", "year")
harlemDFScale <- harlemDF  %>% 
    mutate_at(vars(
                     land_square_feet , gross_square_feet , year_built  , flagshipParkDist , parkDist , subwayDist ,
                     libDistance , avgHHsize , MedHHI ,OO_20k, OO20_49k, OO50_99k, OO100_149k,OO150_299k,OO300_499k,
                     OO_500_749k, OO750_999k, OO1mil_, permits), ~(scale(.) %>% as.vector)) %>% 
    mutate(sale_price = harlemDF$sale_price) %>% 
    filter(complete.cases(.))

harlemDFModel <- harlemDFScale %>% tidyr::nest(-year)
## Warning: All elements of `...` must be named.
## Did you want `data = c(neighborhood, building_class_category, tax_class_at_present, 
##     Block, lot, building_class_at_present, address, zip_code, 
##     residential_units, commercial_units, total_units, land_square_feet, 
##     gross_square_feet, year_built, tax_class_at_time_of_sale, 
##     building_class_at_time_of_sale, sale_price, sale_date, BoroName, 
##     month, lat, long, HalfMile, flagshipParkDist, parkDist, subwayDist, 
##     libDistance, resArea, resAreaTot, resPercent, Geo_FIPS, TotPop, 
##     MalePop, FemalePop, pop_5, pop5_9, pop10_14, pop15_17, pop18_24, 
##     pop25_34, pop35_44, pop45_54, pop55_64, pop65_74, pop75_84, 
##     pop85_, popWhite, popBlack, popNatAm, PopAsian, popPac, popOther, 
##     popTwoRace, popNotHisp, popHisp, popHispWhite, popHispBlack, 
##     popHispNatAm, popHispAsian, popHisPac, popHispOther, popHispTwoRace, 
##     avgHHsize, pop25_, lessHSDeg, Hsdeg, someCol, bachelors, 
##     masters, profDeg, phd, MedHHI, TotOccUnits, RenterUnits, 
##     OwnerOccUnits, OO_20k, OO20_49k, OO50_99k, OO100_149k, OO150_299k, 
##     OO300_499k, OO_500_749k, OO750_999k, OO1mil_, borough, permits, 
##     bbl)`?
formulaList <- list(modlog = valueFormula)

library(purrr)
formulaDF <- map2_dfr(formulaList, formulaList %>% names(), function(x,y)tibble(name = y,formula = list(x)))%>%
    tidyr::pivot_wider(names_from = name, values_from = formula)
harlemDFModel <- harlemDFModel %>% rowwise() %>% 
    bind_cols(formulaDF) %>% 
    tidyr::pivot_longer(-c(year,data), names_to = "name", values_to = "formula")

modelFunc <- function(formula, data){
    mod <- lm(formula, data)
}
predictFunc <- function(mod, data)data %>% mutate(predict = as.vector(predict(mod,data)),
                                                  predictexp = exp(predict))
residFunc <- function(data)data %>% mutate(residual = sale_price - predict)
library(useful)
## glmnet
xFunc <- function(formula,data) build.x(formula, data, contrasts=FALSE, sparse=TRUE)
yFunc <- function(formula,data) build.y(formula, data) %>% as.numeric()
glmnetFunc <- function(x, y){
    set.seed(990)
    mod <- glmnet::glmnet(x=x, y=y,
                     family='gaussian', standardize = FALSE)
}
predictGlmnet <- function(mod, x)tibble(predictGLMNET = as.vector(predict(mod,x,s=0.01)),
                                        predictGLMENTexp = exp(predictGLMNET))


modList <- harlemDFModel %>% mutate(
                                lmmods = map2(formula,data, ~modelFunc(.x,.y)),
                                data = map2(lmmods, data,~predictFunc(.x,.y)),
                                data = map(data, ~residFunc(.x)),
                                x = map2(formula,data,~xFunc(.x,.y)),
                                y = map2(formula,data,~yFunc(.x,.y)),
                                glmnetMods = map2(x,y, ~glmnetFunc(.x,.y)),
                                dataGlmnet = map2(glmnetMods, x,~predictGlmnet(.x,.y)))

1.6 Diagnostics

modList$data[[1]] %>% ggplot(aes(x=residual)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

harlemPLUTO <- st_read(file.path(shapeDir, "harlemPoints"))
## Reading layer `harlemPoints' from data source `/Users/kaz/Documents/Fall2020/WholeFoodsStudy/shape/harlemPoints' using driver `ESRI Shapefile'
## Simple feature collection with 9581 features and 93 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 991882.8 ymin: 224362.6 xmax: 1005913 ymax: 240429.9
## projected CRS:  NAD83 / New York Long Island (ftUS)
map <- harlemPLUTO %>% inner_join(modList$data[[1]], by = c("BBL" = "bbl"))
    tm_shape(wfHarlemBufferOne) + 
    tm_borders() +
    tm_shape(wfHarlemBufferHalf) + 
    tm_borders() +
    tm_shape(map)  +
    tm_symbols(col = "residual") 
## Variable(s) "residual" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

1.7 predict all unit prices per year

allunitsDF <- harlemDFScale %>% select(c(land_square_feet, gross_square_feet, year_built, flagshipParkDist, parkDist, subwayDist,
                     libDistance, avgHHsize, popBlack, TotPop, popHisp, MedHHI, OwnerOccUnits, TotOccUnits, 
                     OO_20k, OO20_49k, OO50_99k, OO100_149k,OO150_299k,OO300_499k,
                     OO_500_749k, OO750_999k, OO1mil_, permits, bbl)) %>% mutate(sale_price = bbl) %>% distinct(.)

modList <- modList %>% mutate(predictData = map(lmmods,~predictFunc(.x,allunitsDF)),
                              xall = map(formula,~xFunc(.x,allunitsDF)),
                                dataGlmnetall = map2(glmnetMods, xall,~predictGlmnet(.x,.y)))
test <- modList %>% select(year, predictData) %>% tidyr::unnest(predictData) %>%
    select(year, bbl, predict,predictexp) %>%     
    bind_cols(modList %>% select(dataGlmnetall) %>% tidyr::unnest(dataGlmnetall)) %>% 
    left_join(mnSalesJoin %>% select(bbl, time, buffer), by = c("bbl" = "bbl")) %>% 
    group_by(year, buffer) %>% 
    summarize(meanPriceLM = mean(predict),
              medPriceLM = median(predict),
              meanPriceGL = mean(predictGLMNET),
              medPriceGL = median(predictGLMNET))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
testPlot <- test %>% tidyr::pivot_longer(names_to = "mods", values_to = "price", -c(year, buffer))
testPlot %>% 
    ggplot(aes(year, price, group = mods)) +
    geom_point(aes(group = mods, color = mods, shape=buffer)) +
    geom_line(aes(group = buffer, color = mods), alpha = .2 ) +
    facet_wrap(~mods) + 
    theme_minimal()

2 Fixed Effects with Hedonic Pricing Outcomes

FEDFHed <- modList %>% select(year, predictData) %>% tidyr::unnest(predictData) %>%
    select(year, bbl, predict,predictexp) %>%     
    bind_cols(modList %>% select(dataGlmnetall) %>% tidyr::unnest(dataGlmnetall)) %>% 
        left_join(mnSalesJoin %>% select(bbl,buffer), by = c("bbl" = "bbl")) %>% 
     mutate(time = if_else(year < 2017, "Before WF", "After WF")) %>% 

    filter(predict < 10000000, predict > 50000,predictGLMNET < 10000000, predictGLMNET > 50000) %>% 
                mutate(buffer = if_else(buffer == "Mile", 0, 1),
                       time = if_else(time =="Before WF", 0, 1),
                       did = buffer * time) %>% 
    ungroup()

feHedLM <- lfe::felm(predict~ did | bbl + year, FEDFHed)

summary(feHedLM)
## 
## Call:
##    lfe::felm(formula = predict ~ did | bbl + year, data = FEDFHed) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -8074762  -748610  -225584   456160  7286771 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## did   596044      22232   26.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1480000 on 136428 degrees of freedom
## Multiple R-squared(full model): 0.4048   Adjusted R-squared: 0.399 
## Multiple R-squared(proj model): 0.005241   Adjusted R-squared: -0.004486 
## F-statistic(full model):69.56 on 1334 and 136428 DF, p-value: < 2.2e-16 
## F-statistic(proj model): 718.8 on 1 and 136428 DF, p-value: < 2.2e-16
feHedGLM <- lfe::felm(predictGLMNET~ did | bbl + year, FEDFHed)

summary(feHedGLM)
## 
## Call:
##    lfe::felm(formula = predictGLMNET ~ did | bbl + year, data = FEDFHed) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1843631  -293620    18218   314650  3050052 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## did   161988       7650   21.18   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 509100 on 136428 degrees of freedom
## Multiple R-squared(full model): 0.5626   Adjusted R-squared: 0.5583 
## Multiple R-squared(proj model): 0.003276   Adjusted R-squared: -0.00647 
## F-statistic(full model):131.6 on 1334 and 136428 DF, p-value: < 2.2e-16 
## F-statistic(proj model): 448.4 on 1 and 136428 DF, p-value: < 2.2e-16

3 regular dnd regression

didreglm = lm(predict ~ buffer * time, data = FEDFHed)
summary(didreglm)
## 
## Call:
## lm(formula = predict ~ buffer * time, data = FEDFHed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3138149  -969180  -537636   315590  8657939 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1310244       6361  205.99   <2e-16 ***
## buffer        417263      14741   28.31   <2e-16 ***
## time         1100246      12481   88.15   <2e-16 ***
## buffer:time   363139      26871   13.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1820000 on 137759 degrees of freedom
## Multiple R-squared:  0.09128,    Adjusted R-squared:  0.09126 
## F-statistic:  4613 on 3 and 137759 DF,  p-value: < 2.2e-16
didregGLM = lm(predictGLMNET ~ buffer * time, data = FEDFHed)
summary(didregGLM)
## 
## Call:
## lm(formula = predictGLMNET ~ buffer * time, data = FEDFHed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1349178  -557740  -118913   426619  4018316 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1354230       2586  523.75   <2e-16 ***
## buffer        162794       5992   27.17   <2e-16 ***
## time          332040       5074   65.45   <2e-16 ***
## buffer:time   221489      10923   20.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 739600 on 137759 degrees of freedom
## Multiple R-squared:  0.06794,    Adjusted R-squared:  0.06792 
## F-statistic:  3347 on 3 and 137759 DF,  p-value: < 2.2e-16

4 LM plots

FEDFHed %>% select(year, bbl, predict, time, buffer) %>% 
    ggplot(aes(x=year, y=predict, group = bbl)) +
    geom_line(alpha = .1, aes(color = time)) +
    facet_wrap(~buffer) +
    theme_minimal()

FEDFHed %>% select(year, bbl, predict, time, buffer) %>% 
    ggplot(aes(x=year, y=predict)) +
    geom_hex(bins = 10) +
    facet_wrap(~buffer) +
    theme_minimal()

FEDFHed %>% select(year, bbl, predict, time, buffer) %>% 
    ggplot(aes(x=year, y=predict)) +
    geom_bin2d(bins = c(5,20)) +
    facet_wrap(~buffer) +
    theme_minimal()

5 glmnet plots

FEDFHed %>% select(year, bbl, predictGLMNET, time, buffer) %>% 
    ggplot(aes(x=year, y=predictGLMNET, group = bbl)) +
    geom_line(alpha = .1, aes(color = time)) +
    facet_wrap(~buffer) +
    theme_minimal()

FEDFHed %>% select(year, bbl, predictGLMNET, time, buffer) %>% 
    ggplot(aes(x=year, y=predictGLMNET)) +
    geom_hex(bins = 10) +
    facet_wrap(~buffer) +
    theme_minimal()

FEDFHed %>% select(year, bbl, predictGLMNET, time, buffer) %>% 
    ggplot(aes(x=year, y=predictGLMNET)) +
    geom_bin2d(bins = c(5,20)) +
    facet_wrap(~buffer) +
    theme_minimal()

5.1 Over buffers

for each distance for each year

## for each distance
distanceList <- as.list(seq(1320,4620, 660))

bufferFunction <- function(buffer, data, wf){
    varName  <- paste0("buffer_", buffer)
    wfBuffer <- wf %>% st_buffer(buffer)
    
    output <- data %>% 
    filter(!Block %in% c(1111, 1897)) %>% 
    sf::st_join(wfBuffer %>%mutate(!!varName := 1),join =st_within) %>%  
        st_drop_geometry() %>% 
        select(!!varName)
}

distanceBuffers <- map_dfc(distanceList, ~bufferFunction(.x, plutoWF, wfHarlem))
distanceBuffers <- distanceBuffers %>% replace(is.na(.), 0)
joinBuffer <- harlemDFScale %>% select(c(land_square_feet, gross_square_feet, year_built, flagshipParkDist, parkDist, subwayDist,
                     libDistance, avgHHsize, popBlack, TotPop, popHisp, MedHHI, OwnerOccUnits, TotOccUnits, 
                     OO_20k, OO20_49k, OO50_99k, OO100_149k,OO150_299k,OO300_499k,
                     OO_500_749k, OO750_999k, OO1mil_, permits, Block, bbl))  %>% distinct(.) %>% 
    left_join(plutoWF %>% bind_cols(distanceBuffers) %>% st_drop_geometry %>% distinct(.)) %>% 
    select(starts_with("buffer_"))
## Joining, by = "Block"
modBuffer <- modList %>% 
    mutate(joinBuffer = map(year, function(x) joinBuffer)) %>% 
    mutate(bufferDF = map2(.x = predictData,.y =  joinBuffer,.f =  function(x,y) tibble(x %>% bind_cols(y))))

modBufferPlot <- modBuffer %>% select(year, bufferDF) %>% tidyr::unnest(bufferDF) %>%
    select(year, bbl, predict, starts_with("buffer_")) %>%     
    bind_cols(modList %>% select(dataGlmnetall) %>% tidyr::unnest(dataGlmnetall)) %>% 
    select(-predictGLMENTexp) %>% 
    tidyr::pivot_longer(names_to = "bufferType", values_to = "treatment", -c(year, bbl, predict, predictGLMNET)) %>% 
    group_by(year, bufferType, treatment) %>% 
    summarize(meanPriceLM = mean(predict),
              medPriceLM = median(predict),
              meanPriceGL = mean(predictGLMNET),
              medPriceGL = median(predictGLMNET))
## `summarise()` regrouping output by 'year', 'bufferType' (override with `.groups` argument)
modBufferPlot %>% 
    tidyr::pivot_longer(names_to = "mods", values_to = "price", -c(year, bufferType, treatment)) %>% 
    mutate(treatment = as.factor(treatment)) %>% 
    ggplot(aes(year, price, group = mods)) +
    geom_point(aes(group = mods, color = mods, shape=treatment)) +
    geom_line(aes(group = treatment, color = mods), alpha = .2 ) +
    facet_grid(mods~bufferType, scales = "free_y") + 
    theme_minimal()+
      theme(axis.text.x = element_text(angle = 45, hjust = 1))

FEDBufferDF <- modBuffer %>% select(year, bufferDF) %>% tidyr::unnest(bufferDF) %>%
    select(year, bbl, predict, starts_with("buffer")) %>%     
    bind_cols(modList %>% select(dataGlmnetall) %>% tidyr::unnest(dataGlmnetall) %>% select(predictGLMNET)) %>% 
    filter(predict < 10000000, predict > 50000,predictGLMNET < 10000000, predictGLMNET > 50000) %>% 
    ungroup()


fedDiagFunc <- function(distance, df, wfYear = 2017, FE = TRUE, mod = "LM"){
    varName = paste0("buffer_", distance)
    
    if(mod != "LM") FEDBufferDF <- FEDBufferDF %>% select(-predict) %>% rename(predict = predictGLMNET)
    
    modDF <- FEDBufferDF %>% select(predict, !!varName, bbl, year) %>% 
             mutate(time = if_else(year < wfYear, 0, 1)) %>%
        mutate(did = time * !!as.name(varName))
    
    if(FE) feHedLM <- lfe::felm(predict~ did | bbl + year, modDF)
    else feHedLM <- lfe::felm(predict~ did, modDF)

rmse <- sqrt(mean(feHedLM$residuals^2))
}

rmseVec <- map_dbl(distanceList, ~fedDiagFunc(.x, FEDBufferDF,wfYear = 2017, FE = TRUE, mod = "GLMNET"))

modGrid <- expand.grid(distance = seq(1320,4620, 660),
            year = seq(2011,2018,1),
            model = c("LM", "GLMENT"),stringsAsFactors = FALSE)
modGridList <- split(modGrid, seq(nrow(modGrid)))
rmseVec <- modGridList %>%  map_dbl(~fedDiagFunc(.x$distance, FEDBufferDF, .x$year,FE = TRUE, mod=.x$model))
modGrid <- modGrid %>% mutate(rmse = rmseVec)
modGrid %>% mutate(distance = as.factor(distance)) %>% 
    ggplot(aes(year, rmse, group = model, color = model)) + 
    geom_line() +
    facet_wrap(~distance) +
    theme_minimal()+
      theme(axis.text.x = element_text(angle = 45, hjust = 1))

modGrid %>% mutate(distance = as.factor(distance)) %>% 
    ggplot(aes(year, rmse, group = model, color = model)) + 
    geom_line() +
    facet_grid(model~distance, scales = "free_y") +
    theme_minimal()+
      theme(axis.text.x = element_text(angle = 45, hjust = 1))